A public data set of spatio-temporal match events in soccer competitions¶

Luca Pappalardo, Paolo Cintia, Alessio Rossi, Emanuele Massucco, Paolo Ferragina, Dino Pedreschi & Fosca Giannotti

Nature Scientific Data 6, Article number: 236 (2019)

if you use this code or the plots generated from it, please cite/mention the following papers:

  • Pappalardo, L., Cintia, P., Rossi, A. et al. A public data set of spatio-temporal match events in soccer competitions. Sci Data 6, 236 (2019) doi:10.1038/s41597-019-0247-7, https://www.nature.com/articles/s41597-019-0247-7

  • Pappalardo, L., Cintia, P., Ferragina, P., Massucco, E., Pedreschi, D., Giannotti, F. (2019) PlayeRank: Data-driven Performance Evaluation and Player Ranking in Soccer via a Machine Learning Approach. ACM Transactions on Intelligent Systems and Technology 10(5) Article 59, DOI: https://doi.org/10.1145/3343172, https://dl.acm.org/citation.cfm?id=3343172

and the data collection on figshare:

  • Pappalardo, Luca; Massucco, Emanuele (2019): Soccer match event dataset. figshare. Collection. https://doi.org/10.6084/m9.figshare.c.4415000

Import library¶

Here we import all the libraries needed to create the plots.

In [ ]:
import json
from collections import Counter
import numpy as np
import operator
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from matplotlib.patches import Ellipse
import seaborn as sns
import pandas as pd
import networkx as nx
import base64
from collections import defaultdict
import sys,os
import math
import random
import operator
import csv
import matplotlib.pylab as pyl
import itertools
import scipy as sp
from scipy import stats
from scipy import optimize
from scipy.integrate import quad

import warnings
warnings.filterwarnings('ignore')

Import data sets¶

Here we import the events data sets, the matches data sets, the players data set and the competitions data set downloaded from the figshare collection (find here the link to download the data: https://www.nature.com/articles/s41597-019-0247-7)

In [ ]:
# Competitions covered by the data set. Each name is the suffix of one
# events_<name>.json file and one matches_<name>.json file under ./data.
nations = ['Italy','England','Germany','France','Spain','European_Championship','World_Cup']

# loading the events data
# The encoding is passed explicitly so the JSON is decoded the same way on
# every platform instead of with the locale-dependent default of open().
events = {}
for nation in nations:
    with open('./data/events/events_%s.json' % nation, encoding='utf-8') as json_data:
        events[nation] = json.load(json_data)

# loading the match data
matches = {}
for nation in nations:
    with open('./data/matches/matches_%s.json' % nation, encoding='utf-8') as json_data:
        matches[nation] = json.load(json_data)

# loading the players data
with open('./data/players.json', encoding='utf-8') as json_data:
    players = json.load(json_data)

# loading the competitions data
with open('./data/competitions.json', encoding='utf-8') as json_data:
    competitions = json.load(json_data)
In [ ]:
# Flatten the per-competition event lists into a single list. Every event
# dict is stamped in place with the key it was loaded under, so the origin
# survives the flattening.
converted_list = []
for nation, nation_events in events.items():
    for record in nation_events:
        record['nation'] = nation
    converted_list.extend(nation_events)
In [ ]:
# One row per event. 'positions' holds a list of {'x', 'y'} dicts; only the
# first entry is used for the plotting coordinates.
df_events = pd.DataFrame(converted_list)
df_events['x'] = [points[0]['x'] for points in df_events['positions']]
# y is mirrored (100 - y) relative to the value stored in the data.
df_events['y'] = [100 - points[0]['y'] for points in df_events['positions']]
df_events
Out[ ]:
eventId subEventName tags playerId positions matchId eventName teamId matchPeriod eventSec subEventId id nation x y
0 8 Simple pass [{'id': 1801}] 8327 [{'y': 52, 'x': 49}, {'y': 44, 'x': 43}] 2575959 Pass 3158 1H 2.530536 85 180423957 Italy 49 48
1 8 Simple pass [{'id': 1801}] 20438 [{'y': 44, 'x': 43}, {'y': 17, 'x': 36}] 2575959 Pass 3158 1H 3.768418 85 180423958 Italy 43 56
2 7 Touch [] 8306 [{'y': 17, 'x': 36}, {'y': 56, 'x': 78}] 2575959 Others on the ball 3158 1H 4.868265 72 180423959 Italy 36 83
3 1 Ground attacking duel [{'id': 504}, {'id': 703}, {'id': 1801}] 8306 [{'y': 56, 'x': 78}, {'y': 15, 'x': 64}] 2575959 Duel 3158 1H 8.114676 11 180423960 Italy 78 44
4 1 Ground attacking duel [{'id': 503}, {'id': 703}, {'id': 1801}] 8306 [{'y': 15, 'x': 64}, {'y': 15, 'x': 72}] 2575959 Duel 3158 1H 8.647892 11 180423961 Italy 64 85
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3251289 8 Simple pass [{'id': 1801}] 3476 [{'y': 20, 'x': 46}, {'y': 6, 'x': 64}] 2058017 Pass 9598 2H 2978.301867 85 263885652 World_Cup 46 80
3251290 7 Touch [] 14812 [{'y': 6, 'x': 64}, {'y': 2, 'x': 82}] 2058017 Others on the ball 9598 2H 2979.084611 72 263885653 World_Cup 64 94
3251291 8 Cross [{'id': 401}, {'id': 801}, {'id': 1802}] 14812 [{'y': 2, 'x': 82}, {'y': 100, 'x': 100}] 2058017 Pass 9598 2H 2983.448628 80 263885654 World_Cup 82 98
3251292 4 Goalkeeper leaving line [] 25381 [{'y': 0, 'x': 0}, {'y': 98, 'x': 18}] 2058017 Goalkeeper leaving line 4418 2H 2985.869275 40 263885613 World_Cup 0 100
3251293 8 Launch [{'id': 1802}] 25381 [{'y': 43, 'x': 14}, {'y': 0, 'x': 0}] 2058017 Pass 4418 2H 3002.148765 84 263885618 World_Cup 14 57

3251294 rows × 15 columns

In [ ]:
# Replace each player's nested 'role' mapping by just its 'name' entry.
# Players without a 'role' key, or whose role has no 'name', are left as-is.
for player in players:
    if 'role' not in player:
        continue
    role_info = player['role']
    if 'name' in role_info:
        player['role'] = role_info['name']
In [ ]:
# Tabulate the player records: one row per entry of `players`.
players_df = pd.DataFrame(players)
In [ ]:
players_df
Out[ ]:
passportArea weight firstName middleName lastName currentTeamId birthDate height role birthArea wyId foot shortName currentNationalTeamId
0 {'name': 'Turkey', 'id': '792', 'alpha3code': ... 78 Harun Tekin 4502 1989-06-17 187 Goalkeeper {'name': 'Turkey', 'id': '792', 'alpha3code': ... 32777 right H. Tekin 4687
1 {'name': 'Senegal', 'id': '686', 'alpha3code':... 73 Malang Sarr 3775 1999-01-23 182 Defender {'name': 'France', 'id': '250', 'alpha3code': ... 393228 left M. Sarr 4423
2 {'name': 'France', 'id': '250', 'alpha3code': ... 72 Over Mandanda 3772 1998-10-26 176 Goalkeeper {'name': 'France', 'id': '250', 'alpha3code': ... 393230 O. Mandanda null
3 {'name': 'Senegal', 'id': '686', 'alpha3code':... 82 Alfred John Momar N'Diaye 683 1990-03-06 187 Midfielder {'name': 'France', 'id': '250', 'alpha3code': ... 32793 right A. N'Diaye 19314
4 {'name': 'France', 'id': '250', 'alpha3code': ... 84 Ibrahima Konat\u00e9 2975 1999-05-25 192 Defender {'name': 'France', 'id': '250', 'alpha3code': ... 393247 right I. Konat\u00e9 null
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3598 {'name': 'Tunisia', 'id': 788, 'alpha3code': '... 72 Ali Ma\u00e2loul 16041 1990-01-01 175 Defender {'name': 'Tunisia', 'id': 788, 'alpha3code': '... 120839 left A. Ma\u00e2loul null
3599 {'name': 'Peru', 'id': 604, 'alpha3code': 'PER... 76 Carlos Alberto C\u00e1ceda Oyaguez 15591 1991-09-27 183 Goalkeeper {'name': 'Peru', 'id': 604, 'alpha3code': 'PER... 114736 right C. C\u00e1ceda null
3600 {'name': 'Peru', 'id': 604, 'alpha3code': 'PER... 78 Miguel Gianpierre Araujo Blanco 12072 1994-10-24 179 Defender {'name': 'Peru', 'id': 604, 'alpha3code': 'PER... 114908 right M. Araujo null
3601 {'name': 'Morocco', 'id': 504, 'alpha3code': '... 70 Ahmed Reda Tagnaouti 16183 1996-04-05 182 Goalkeeper {'name': 'Morocco', 'id': 504, 'alpha3code': '... 285583 right A. Tagnaouti null
3602 {'name': 'Panama', 'id': 591, 'alpha3code': 'P... 0 Ricardo Guardia Avila 62943 1997-02-04 0 Midfielder {'name': 'Panama', 'id': 591, 'alpha3code': 'P... 361536 left R. Avila null

3603 rows × 14 columns

In [ ]:
# Attach player attributes to each event by matching the event's 'playerId'
# to the player's 'wyId'. The inner join keeps only events whose playerId has
# a matching player record.
merged_df = pd.merge(df_events, players_df, left_on='playerId', right_on='wyId', how='inner')
In [ ]:
# Narrow the frame to the event descriptors, the player's role, the
# competition key and the plotting coordinates.
merged_df=merged_df[['eventId', 'eventName', 'playerId', 'subEventName', 'tags', 'role', 'nation', 'x', 'y']]
In [ ]:
# Event types retained for the heatmaps; every other eventName is discarded.
events_to_keep = ['Pass', 'Duel', 'Shot', 'Others on the ball', 'Free Kick']
# Take an explicit copy: later cells add columns to this frame, and assigning
# into a filtered slice of merged_df is the chained-assignment pattern pandas
# warns about (SettingWithCopyWarning).
merged_df_preprocessed1 = merged_df[merged_df['eventName'].isin(events_to_keep)].copy()
In [ ]:
def determine_category1(row):
    """Return the coarse plotting category for one event row.

    Args:
        row: Mapping-like object with an 'eventName' entry and, for
            'Others on the ball' events, a string 'subEventName' entry.

    Returns:
        The row's 'eventName' for every event type except
        'Others on the ball'. Those are split by sub-event (compared
        case-insensitively) into 'Others on the ball - Acceleration' and
        'Others on the ball - Clearance'; any other sub-event yields the
        marker value 'Drop Row'.
    """
    event_name = row['eventName']
    if event_name != 'Others on the ball':
        return event_name

    sub_event = row['subEventName'].lower()
    if sub_event == 'acceleration':
        return 'Others on the ball - Acceleration'
    if sub_event == 'clearance':
        return 'Others on the ball - Clearance'
    return 'Drop Row'

# Label every row with its coarse category. Rows labelled 'Drop Row' are only
# marked here, not removed from the frame.
merged_df_preprocessed1['category1'] = merged_df_preprocessed1.apply(determine_category1, axis=1)
In [ ]:
merged_df_preprocessed1
Out[ ]:
eventId eventName playerId subEventName tags role nation x y category1
0 8 Pass 8327 Simple pass [{'id': 1801}] Forward Italy 49 48 Pass
1 1 Duel 8327 Ground attacking duel [{'id': 602}, {'id': 703}, {'id': 1801}] Forward Italy 72 75 Duel
2 1 Duel 8327 Ground attacking duel [{'id': 701}, {'id': 1802}] Forward Italy 82 36 Duel
3 1 Duel 8327 Ground attacking duel [{'id': 702}, {'id': 1801}] Forward Italy 71 71 Duel
4 1 Duel 8327 Ground attacking duel [{'id': 602}, {'id': 701}, {'id': 1802}] Forward Italy 72 26 Duel
... ... ... ... ... ... ... ... ... ... ...
3025251 8 Pass 70379 Simple pass [{'id': 1801}] Midfielder World_Cup 71 4 Pass
3025252 1 Duel 70379 Ground attacking duel [{'id': 502}, {'id': 703}, {'id': 1801}] Midfielder World_Cup 78 10 Duel
3025253 8 Pass 70379 Cross [{'id': 401}, {'id': 801}, {'id': 1802}] Midfielder World_Cup 76 20 Pass
3025254 8 Pass 70379 Simple pass [{'id': 1801}] Midfielder World_Cup 70 32 Pass
3025255 8 Pass 70379 Simple pass [{'id': 1801}] Midfielder World_Cup 75 3 Pass

2942586 rows × 10 columns

In [ ]:
def determine_category2(row):
    """Return the detailed category for one event row.

    Args:
        row: Mapping-like object with an 'eventName' entry and, for 'Pass'
            events, a 'role' entry.

    Returns:
        For 'Pass' events, 'Pass Forward' or 'Pass Midfielder' when the
        role is exactly 'Forward' or 'Midfielder', and 'Pass Defender' for
        every other role value. Non-pass events return their 'eventName'
        unchanged.
    """
    if row['eventName'] != 'Pass':
        return row['eventName']

    role = row['role']
    labelled_roles = (
        ('Forward', 'Pass Forward'),
        ('Midfielder', 'Pass Midfielder'),
    )
    for known_role, label in labelled_roles:
        if role == known_role:
            return label
    # Catch-all: any role that is neither Forward nor Midfielder.
    return 'Pass Defender'
In [ ]:
# Add the detailed category: passes are split by the passer's role, all other
# events keep their eventName.
merged_df_preprocessed1['category2'] = merged_df_preprocessed1.apply(determine_category2, axis=1)
In [ ]:
merged_df_preprocessed1
Out[ ]:
eventId eventName playerId subEventName tags role nation x y category1 category2
0 8 Pass 8327 Simple pass [{'id': 1801}] Forward Italy 49 48 Pass Pass Forward
1 1 Duel 8327 Ground attacking duel [{'id': 602}, {'id': 703}, {'id': 1801}] Forward Italy 72 75 Duel Duel
2 1 Duel 8327 Ground attacking duel [{'id': 701}, {'id': 1802}] Forward Italy 82 36 Duel Duel
3 1 Duel 8327 Ground attacking duel [{'id': 702}, {'id': 1801}] Forward Italy 71 71 Duel Duel
4 1 Duel 8327 Ground attacking duel [{'id': 602}, {'id': 701}, {'id': 1802}] Forward Italy 72 26 Duel Duel
... ... ... ... ... ... ... ... ... ... ... ...
3025251 8 Pass 70379 Simple pass [{'id': 1801}] Midfielder World_Cup 71 4 Pass Pass Midfielder
3025252 1 Duel 70379 Ground attacking duel [{'id': 502}, {'id': 703}, {'id': 1801}] Midfielder World_Cup 78 10 Duel Duel
3025253 8 Pass 70379 Cross [{'id': 401}, {'id': 801}, {'id': 1802}] Midfielder World_Cup 76 20 Pass Pass Midfielder
3025254 8 Pass 70379 Simple pass [{'id': 1801}] Midfielder World_Cup 70 32 Pass Pass Midfielder
3025255 8 Pass 70379 Simple pass [{'id': 1801}] Midfielder World_Cup 75 3 Pass Pass Midfielder

2942586 rows × 11 columns

Functions¶

Here we create all the functions needed to create the plots.

In [ ]:
# Down-sample the events before plotting (presumably to keep the embedded
# chart data manageable). The size is capped at the number of available rows
# because sample() raises ValueError when asked for more rows than exist, and
# a fixed seed makes the sampled subset, and so the heatmaps, reproducible.
merged_df_preprocessed1 = merged_df_preprocessed1.sample(
    n=min(300000, len(merged_df_preprocessed1)),
    random_state=42,
)
In [ ]:
merged_df_preprocessed1.category1.unique()
Out[ ]:
array(['Duel', 'Pass', 'Drop Row', 'Free Kick',
       'Others on the ball - Clearance', 'Shot',
       'Others on the ball - Acceleration'], dtype=object)
In [ ]:
#Pass, Shot, Duel, Free Kick, Others on the ball--Acceleration, Others on the ball--Clearance
In [ ]:
merged_df_preprocessed1.columns
Out[ ]:
Index(['eventId', 'eventName', 'playerId', 'subEventName', 'tags', 'role',
       'nation', 'x', 'y', 'category1', 'category2'],
      dtype='object')
In [ ]:
import altair as alt
import pandas as pd

# Lift Altair's limit on the number of rows a chart's data may contain, so
# the large event frame can be passed to the charts below.
alt.data_transformers.disable_max_rows()

def draw_pitch_altair():
    """Build the pitch outline as a layered Altair chart.

    The outline is drawn on a 0-100 by 0-100 coordinate grid and consists
    of the four boundary lines, a vertical line at x=50, an open three-sided
    box at each end (x 0-17 and x 83-100, y 25-75; presumably the penalty
    areas) and a black-outlined, unfilled circle mark centred at (50, 50).

    Returns:
        A layered chart (rule segments plus the circle mark) with
        width=700 and height=500.
    """
    # (start, end) endpoints of every straight marking.
    segments = [
        # outer boundary
        ([0, 0], [100, 0]),
        ([100, 0], [100, 100]),
        ([100, 100], [0, 100]),
        ([0, 100], [0, 0]),
        # line at x=50
        ([50, 0], [50, 100]),
        # box at the x=0 end
        ([17, 25], [17, 75]),
        ([17, 25], [0, 25]),
        ([17, 75], [0, 75]),
        # box at the x=100 end
        ([100, 25], [83, 25]),
        ([83, 25], [83, 75]),
        ([100, 75], [83, 75]),
    ]
    pitch_df = pd.DataFrame(
        [{'start': start, 'end': end} for start, end in segments]
    )

    # Field names such as 'start[0]' index into the two-element list stored
    # in each column (nested-field access in the chart spec).
    rules = alt.Chart(pitch_df).mark_rule(strokeWidth=2)
    lines = rules.encode(
        x='start[0]:Q',
        y='start[1]:Q',
        x2='end[0]:Q',
        y2='end[1]:Q',
        color=alt.value('black'),
    )

    centre_point = pd.DataFrame({'x': [50], 'y': [50]})
    circle_mark = alt.Chart(centre_point).mark_circle(
        size=3000, stroke='black', strokeWidth=2, fill=None
    )
    # NOTE(review): this 600x400 size differs from the 700x500 applied to
    # the layered chart below; kept unchanged.
    center_circle = circle_mark.encode(x='x:Q', y='y:Q').properties(
        width=600, height=400
    )

    return alt.layer(lines, center_circle).properties(width=700, height=500)


def create_heatmap(df, category, colors):
    """Build a binned 2-D count heatmap of the events in one category.

    Args:
        df: DataFrame with at least the columns 'category1', 'x' and 'y'.
        category: Value of 'category1' whose rows are plotted.
        colors: Name of the colour scheme handed to the colour scale.

    Returns:
        A 700x500 Altair chart of rectangles, binned on x and y (at most
        100 bins per axis) and coloured by the number of events per bin.
    """
    # A boolean mask is used instead of a string-built df.query(): the query
    # text would break (or change meaning) if `category` contained a quote.
    selected = df[df['category1'] == category]
    heatmap = alt.Chart(selected).mark_rect().encode(
        alt.X('x:Q', bin=alt.Bin(maxbins=100)),
        alt.Y('y:Q', bin=alt.Bin(maxbins=100)),
        color=alt.Color('count()', scale=alt.Scale(scheme=colors)),
    ).properties(width=700, height=500)
    return heatmap

# Drop-down selection on the 'nation' field. The options are None followed by
# the sorted competition keys present in the data; empty='all' makes an empty
# selection match every row.
nation_selector = alt.selection_single(
    name="Select Nation",
    fields=['nation'],
    empty='all',
    bind=alt.binding_select(
        options=[None, *sorted(merged_df_preprocessed1['nation'].unique())]
    ),
)

# Drop-down selection on 'category2', restricted to the three pass labels.
more_detail_selector = alt.selection_single(
    name="See More Detail",
    fields=['category2'],
    empty='all',
    bind=alt.binding_select(
        options=[None, 'Pass Defender', 'Pass Midfielder', 'Pass Forward']
    ),
)


def create_interactive_heatmap(df, category, colors, detail_selector=None):
    """Build a heatmap over the pitch outline, filterable by drop-downs.

    Args:
        df: DataFrame passed on to create_heatmap.
        category: Value of 'category1' to plot; also used in the title.
        colors: Name of the colour scheme for the heatmap.
        detail_selector: Optional extra selection. When given, the heatmap
            is additionally filtered by it and the selection is registered
            on the returned chart.

    Returns:
        A 700x500 layered chart: the heatmap filtered by the module-level
        `nation_selector` (and by `detail_selector` if supplied), with the
        pitch outline drawn on top.
    """
    filtered_heatmap = create_heatmap(df, category, colors).transform_filter(
        nation_selector
    )
    # Explicit None check: the decision should not depend on the truthiness
    # of a selection object.
    if detail_selector is not None:
        filtered_heatmap = filtered_heatmap.transform_filter(detail_selector)

    final_chart = alt.layer(
        filtered_heatmap,
        draw_pitch_altair()
    ).add_selection(
        nation_selector
    ).properties(
        width=700,
        height=500,
        title=f"{category} Events by Nation"
    )

    # Register exactly the selection that was used for filtering. Keying this
    # on `category == 'Pass'` and a global selector could filter on a
    # selection the chart never declares, or declare one that filters nothing.
    if detail_selector is not None:
        final_chart = final_chart.add_selection(detail_selector)

    return final_chart

# Build one interactive heatmap per activity type, then render them in order.
activity_types = ['Shot', 'Pass', 'Duel', 'Free Kick', 'Others on the ball - Acceleration', 'Others on the ball - Clearance']
final_charts = []
for activity in activity_types:
    # Only the 'Pass' chart gets the extra category2 drop-down.
    detail = more_detail_selector if activity == 'Pass' else None
    heatmap = create_interactive_heatmap(merged_df_preprocessed1, activity, 'greens', detail)

    # The title set inside create_interactive_heatmap is replaced by the bare
    # activity name here.
    final_chart = heatmap.properties(width=700, height=500, title=activity)
    final_charts.append(final_chart)

for chart in final_charts:
    chart.display()